home *** CD-ROM | disk | FTP | other *** search
/ Enter 2006 September / Enter 09 2006.iso / Internet / SpamExperts Home 1.1 / SpamExperts Home.exe / lib / spamexperts.modules / spamexperts / ProxyClassifier.pyc (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2006-07-14  |  16.8 KB  |  427 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.4)
  3.  
  4. from __future__ import division
  5. import sys
  6. import time
  7. import email
  8. import Queue
  9. import socket
  10. import xmlrpclib
  11. import threading
  12. import traceback
  13. import win32api
  14. import win32process
  15. from spambayes import Dibbler
  16. from spambayes.storage import NO_TRAINING_FLAG
  17. from spambayes.message import insert_exception_header
  18. import se_config
  19. from spamexperts import message
  20. from spamexperts import dnslookup
  21. from spamexperts.Options import options
  22. from spamexperts.fingerprint import fingerprint
  23. from spamexperts.dnsclassifier import DNSClassifier
  24. from spamexperts.OptionsClass import BLOCKED, DELAYED
  25. from spamexperts.OptionsClass import IS_HAM, IS_SPAM, IS_UNSURE, IS_UNKNOWN
  26.  
  27. try:
  28.     _
  29. except NameError:
  30.     
  31.     _ = lambda x: x
  32.  
  33.  
class ProxyClassifier(object):
    """Classify mail and process queued messages in the background.

    ``classify_message`` consults the address, statistical, fingerprint and
    (when spam blocking is enabled) DNS classifiers in that order.
    ``process_queue`` drains ``processing_queue`` on a worker thread started
    by ``init``.

    Everything below is a class attribute and the classifier methods are
    classmethods, so this state is shared by every user of the class.
    """

    # Key used by the delayed_db / blocked_db / messages_to_delete
    # properties to pick the per-account entry out of the state stores.
    current_account = None
    # Not referenced by the code visible in this listing.
    # NOTE(review): presumably slack for headers added to a message - confirm.
    HEADER_SIZE_FUDGE_FACTOR = 512
    # Computes message fingerprints and turns server match counts into a
    # score (see _classify_by_fingerprint).
    fingerprint_engine = fingerprint.Fingerprint()
    # Scores messages for _classify_by_DNS.
    DNS = DNSClassifier()
    # Shared application state (classifiers, corpora, queues, notifier).
    # It is None here and read everywhere as ``cls.state``, so it has to be
    # assigned from outside this listing before any method is used.
    state = None
    # Number of fingerprint lookups attempted since the last reset.
    fingerprints = 0
    # Failure tally for those lookups; each failed attempt adds 1/3.
    fingerprint_failures = 0
    # None while lookups are enabled, otherwise the time.time() value after
    # which they are enabled again.
    fingerprint_disabled = None
    # Work items for process_queue, unpacked there as
    # (msg, uid, info, current_account).
    processing_queue = Queue.Queue()
    # The worker thread created by init().
    processing_thread = None
    # When true, process_queue exits once it finds the queue empty.
    finish_processing = False
    # uids that process_queue has already handled; repeats are skipped.
    processed_ids = []
  47.     
  48.     def init(cls):
  49.         cls.processing_thread = threading.Thread(target = cls.process_queue)
  50.         cls.processing_thread.setDaemon(True)
  51.         cls.processing_thread.start()
  52.  
  53.     init = classmethod(init)
  54.     
  55.     def _get_delayed_db(self):
  56.         return self.state.delayed_messages[self.current_account]
  57.  
  58.     
  59.     def _get_blocked_db(self):
  60.         return self.state.blocked_messages[self.current_account]
  61.  
  62.     
  63.     def _get_messages_to_delete(self):
  64.         return self.state.delete_messages[self.current_account]
  65.  
  66.     
  67.     def _set_delayed_db(self, value):
  68.         self.state.delayed_messages[self.current_account] = value
  69.         self.state.delayed_messages.store()
  70.  
  71.     
  72.     def _set_blocked_db(self, value):
  73.         self.state.blocked_messages[self.current_account] = value
  74.         self.state.blocked_messages.store()
  75.  
  76.     
  77.     def _set_messages_to_delete(self, value):
  78.         self.state.delete_messages[self.current_account] = value
  79.         self.state.delete_messages.store()
  80.  
  81.     delayed_db = property(_get_delayed_db, _set_delayed_db)
  82.     blocked_db = property(_get_blocked_db, _set_blocked_db)
  83.     messages_to_delete = property(_get_messages_to_delete, _set_messages_to_delete)
  84.     
  85.     def _classify_by_address(cls, msg):
  86.         """Use the 'From' address to classify.
  87.         """
  88.         addr_classifier = cls.state.address_classifier
  89.         from_addr = msg['From']
  90.         (prob, clues) = addr_classifier.spamprob(from_addr, evidence = True)
  91.         if prob == 1.0:
  92.             return (1.0, IS_SPAM, clues)
  93.         elif prob == 0.0:
  94.             return (0.0, IS_HAM, clues)
  95.         
  96.         return (0.5, IS_UNKNOWN, clues)
  97.  
  98.     _classify_by_address = classmethod(_classify_by_address)
  99.     
  100.     def _classify_by_statistical(cls, msg):
  101.         '''Use statistical analysis to classify.
  102.         '''
  103.         minimum_ham = options[('Classifier', 'minimum_ham')]
  104.         minimum_spam = options[('Classifier', 'minimum_spam')]
  105.         if cls.state.bayes.nham < minimum_ham or cls.state.bayes.nspam < minimum_spam:
  106.             clues = [
  107.                 ('*HAMCOUNT*', cls.state.bayes.nham),
  108.                 ('*SPAMCOUNT*', cls.state.bayes.nspam)]
  109.             return (0.5, IS_UNKNOWN, clues)
  110.         
  111.         (score, clues) = cls.state.bayes.spamprob(msg.tokenize(), evidence = True)
  112.         if score > options[('Categorization', 'spam_cutoff')]:
  113.             klass = IS_SPAM
  114.         elif score < options[('Categorization', 'ham_cutoff')]:
  115.             klass = IS_HAM
  116.         else:
  117.             klass = IS_UNKNOWN
  118.         return (score, klass, clues)
  119.  
  120.     _classify_by_statistical = classmethod(_classify_by_statistical)
  121.     
  122.     def _classify_by_fingerprint(cls, msg):
  123.         '''Use the fingerprinting system to classify.
  124.         '''
  125.         FP = cls.fingerprint_engine.get_fingerprint(msg)
  126.         if not FP:
  127.             return (0.5, IS_UNKNOWN, [
  128.                 ('fp-insufficient', '')])
  129.         
  130.         mapping = {
  131.             'fingerprints': list(FP),
  132.             'user_id': se_config.spamexpertsConfig.user_id }
  133.         if cls.fingerprint_disabled and time.time() > cls.fingerprint_disabled:
  134.             cls.fingerprints = 0
  135.             cls.fingerprint_failures = 0
  136.             cls.fingerprint_disabled = None
  137.         elif cls.fingerprints and cls.fingerprint_failures / cls.fingerprints > 0.29999999999999999:
  138.             cls.fingerprint_disabled = time.time() + 3600
  139.         
  140.         if cls.fingerprint_disabled:
  141.             return (0.5, IS_UNKNOWN, [
  142.                 ('fp-disabled-count', cls.fingerprints),
  143.                 ('fp-disabled-failure', cls.fingerprint_failures),
  144.                 ('fp-disabled-time', cls.fingerprint_disabled - time.time())])
  145.         
  146.         old_timeout = socket.getdefaulttimeout()
  147.         socket.setdefaulttimeout(10)
  148.         cls.fingerprints += 1
  149.         for unused in xrange(3):
  150.             
  151.             try:
  152.                 result = cls.state.fingerprintclient.query(mapping)
  153.             except socket.timeout:
  154.                 cls
  155.                 cls
  156.                 print >>sys.stderr, 'Connection to FP server timed out.'
  157.                 score = 0.5
  158.                 clues = [
  159.                     ('fp-timeout', socket.getdefaulttimeout())]
  160.                 cls.fingerprint_failures += 1 / 3
  161.                 continue
  162.                 cls
  163.                 except (socket.error, xmlrpclib.Fault, StandardError):
  164.                     e = None
  165.                     print 'Error msg %s: Failed to query fingerprint server for %s.' % (e, msg.getId())
  166.                     if options[('globals', 'verbose')]:
  167.                         (error_type, error, tb) = sys.exc_info()
  168.                         traceback.print_exception(error_type, error, tb)
  169.                     
  170.                     score = 0.5
  171.                     clues = []
  172.                     cls.fingerprint_failures += 1 / 3
  173.                     continue
  174.                     cls
  175.                 elif result is None:
  176.                     score = 0.5
  177.                     clues = [
  178.                         ('fp-server-error', result)]
  179.                     cls.fingerprint_failures += 1 / 3
  180.                     continue
  181.                 else:
  182.                     (match_count, matches) = result
  183.                     if match_count is None or match_count == '':
  184.                         score = 0.5
  185.                         clues = [
  186.                             ('fp-server-error', result)]
  187.                         cls.fingerprint_failures += 1 / 3
  188.                         continue
  189.                     
  190.  
  191.             score = cls.fingerprint_engine.spamprob(match_count, FP)
  192.             clues = [
  193.                 ('FP:' + str(FP), float(match_count)),
  194.                 ('Matches:' + str(matches), score)]
  195.         
  196.         socket.setdefaulttimeout(old_timeout)
  197.         if score > options[('Categorization', 'fingerprint_spam_cutoff')]:
  198.             return (score, IS_SPAM, clues)
  199.         
  200.         return (score, IS_UNKNOWN, clues)
  201.  
  202.     _classify_by_fingerprint = classmethod(_classify_by_fingerprint)
  203.     
  204.     def _classify_by_DNS(cls, msg):
  205.         (score, clues) = cls.DNS.spamprob(msg, True)
  206.         if score > options[('Categorization', 'dns_spam_cutoff')]:
  207.             return (score, IS_SPAM, clues)
  208.         
  209.         return (score, IS_UNKNOWN, clues)
  210.  
  211.     _classify_by_DNS = classmethod(_classify_by_DNS)
  212.     
  213.     def classify_message(cls, msg, flags = 0):
  214.         '''Classify the given SEHeaderMessage, and return the
  215.         classification.
  216.         '''
  217.         (s_score, s_klass, s_clues) = cls._classify_by_statistical(msg)
  218.         
  219.         statistical = lambda unused: (s_score, s_klass, s_clues)
  220.         if se_config.spamexpertsConfig.block_spam:
  221.             systems = ((cls._classify_by_address, 'address', True), (statistical, 'statistical', False), (cls._classify_by_fingerprint, 'fingerprint', False), (cls._classify_by_DNS, 'dns', False))
  222.         else:
  223.             systems = ((cls._classify_by_address, 'address', True), (statistical, 'statistical', False), (cls._classify_by_fingerprint, 'fingerprint', False))
  224.         final_score = 0.5
  225.         final_klass = IS_UNSURE
  226.         final_clues = []
  227.         skip_training = False
  228.         for classifier, description, skip in systems:
  229.             start = time.time()
  230.             (score, klass, clues) = classifier(msg)
  231.             if options[('globals', 'verbose')]:
  232.                 print description, 'took', time.time() - start, 'seconds.'
  233.             
  234.             final_clues.extend(clues)
  235.             final_clues.append((description, score))
  236.             if klass != IS_UNKNOWN:
  237.                 final_score = score
  238.                 final_klass = klass
  239.                 skip_training = skip
  240.                 break
  241.                 continue
  242.         
  243.         if final_klass == IS_UNKNOWN and options[('globals', 'verbose')]:
  244.             print 'Final classification was unsure.'
  245.         
  246.         if not skip_training and not (flags & NO_TRAINING_FLAG) and not (final_klass == IS_UNSURE):
  247.             if final_klass == IS_SPAM:
  248.                 cls.upload_fingerprint(msg)
  249.             
  250.             cls.train_statistical(msg, s_klass, final_klass)
  251.         
  252.         if not flags & NO_TRAINING_FLAG:
  253.             cls.state.statistics.RecordClassification(final_score)
  254.         
  255.         return (final_score, final_klass, final_clues)
  256.  
  257.     classify_message = classmethod(classify_message)
  258.     
  259.     def upload_fingerprint(cls, msg):
  260.         '''Upload the fingerprint for this message to the server.
  261.         '''
  262.         cls.state.fingerprint_queue.put(msg)
  263.  
  264.     upload_fingerprint = classmethod(upload_fingerprint)
  265.     
  266.     def train_statistical(cls, msg, s_klass, klass):
  267.         if s_klass == IS_UNKNOWN:
  268.             if options[('globals', 'verbose')]:
  269.                 print 'Auto-training statistical system.'
  270.             
  271.             asSpam = klass == IS_SPAM
  272.             cls.state.training_queue.put((msg, asSpam))
  273.         
  274.  
  275.     train_statistical = classmethod(train_statistical)
  276.     
  277.     def process_message(cls, messageText, uid, msg_info, current_account):
  278.         msg_class = message.SEHeaderMessage
  279.         msg = email.message_from_string(messageText, _class = msg_class)
  280.         msg.setId(uid)
  281.         
  282.         try:
  283.             (score, classification, clues) = cls.classify_message(msg)
  284.         except:
  285.             (messageText, details) = insert_exception_header(messageText)
  286.             print >>sys.stderr, details
  287.             del msg
  288.             msg = email.message_from_string(messageText, _class = msg_class)
  289.             msg.setId(uid)
  290.             classification = IS_HAM
  291.             score = 0.0
  292.             clues = []
  293.  
  294.         result = classification
  295.         if classification == IS_SPAM:
  296.             classification = options[('Headers', 'header_spam_string')]
  297.             old = cls.state.blocked_messages[current_account]
  298.             old[msg.getId()] = msg_info
  299.             cls.state.blocked_messages[current_account] = old
  300.             cls.state.blocked_messages.store()
  301.             msg.rememberBlockingState(current_account, BLOCKED)
  302.             old = cls.state.delete_messages[current_account]
  303.             old[msg.getId()] = msg_info
  304.             cls.state.delete_messages[current_account] = old
  305.             cls.state.delete_messages.store()
  306.         elif classification == IS_UNSURE:
  307.             classification = options[('Headers', 'header_unsure_string')]
  308.         else:
  309.             classification = options[('Headers', 'header_ham_string')]
  310.         msg.rememberBlockingState(current_account, DELAYED)
  311.         if msg_info:
  312.             old = cls.state.delayed_messages[current_account]
  313.             old[msg.getId()] = msg_info
  314.             cls.state.delayed_messages[current_account] = old
  315.             cls.state.delayed_messages.store()
  316.         
  317.         msg.RememberClassification(classification)
  318.         msg.addHeaders(prob = score, clues = clues)
  319.         cls.store_and_count_classified_message(msg, classification)
  320.         return result
  321.  
  322.     process_message = classmethod(process_message)
  323.     
  324.     def store_and_count_classified_message(cls, msg, classification):
  325.         '''Add message to the appropriate corpus, and count the number
  326.         of ham/spam.
  327.         '''
  328.         if classification == options[('Headers', 'header_spam_string')]:
  329.             cls.state.numSpams += 1
  330.             corpus = cls.state.spamCorpus
  331.         elif classification == options[('Headers', 'header_unsure_string')]:
  332.             corpus = cls.state.unsureCorpus
  333.         else:
  334.             cls.state.numHams += 1
  335.             corpus = cls.state.hamCorpus
  336.         message = corpus.makeMessage(msg.getId(), msg.as_string())
  337.         corpus.addMessage(message, observer_flags = NO_TRAINING_FLAG)
  338.  
  339.     store_and_count_classified_message = classmethod(store_and_count_classified_message)
  340.     
  341.     def process_queue(cls):
  342.         '''Background message processing.  This method is started when an
  343.         object of this class is created (in the state) and run until
  344.         the static variable finish_processing is True.'''
  345.         below_normal = win32process.THREAD_PRIORITY_BELOW_NORMAL
  346.         win32process.SetThreadPriority(win32api.GetCurrentThread(), below_normal)
  347.         told_notifier = False
  348.         counts = {
  349.             IS_HAM: 0,
  350.             IS_SPAM: 0,
  351.             IS_UNSURE: 0 }
  352.         while True:
  353.             
  354.             try:
  355.                 process_data = cls.processing_queue.get_nowait()
  356.                 (msg, uid, info, current_account) = process_data
  357.                 cls.state.model_notifier.SetBeginUpdating()
  358.                 told_notifier = True
  359.             except Queue.Empty:
  360.                 if told_notifier:
  361.                     time.sleep(10)
  362.                     if not cls.processing_queue.empty():
  363.                         continue
  364.                     
  365.                     if counts[IS_HAM] or counts[IS_UNSURE]:
  366.                         total_count = counts[IS_HAM] + counts[IS_UNSURE]
  367.                         if total_count != 1:
  368.                             plural = _('messages are')
  369.                         else:
  370.                             plural = _('message is')
  371.                         msg = _('%d %s waiting to be downloaded.') % (total_count, plural)
  372.                         if counts[IS_SPAM] == 1:
  373.                             msg += _(' 1 spam was blocked.')
  374.                         elif counts[IS_SPAM] > 1:
  375.                             msg += _(' %d spam were blocked.') % (counts[IS_SPAM],)
  376.                         
  377.                         snd = se_config.spamexpertsConfig.notify_sound
  378.                         notifier = cls.state.model_notifier
  379.                         notifier.SetEndUpdating(msg, snd)
  380.                         counts[IS_HAM] = 0
  381.                         counts[IS_SPAM] = 0
  382.                         counts[IS_UNSURE] = 0
  383.                     else:
  384.                         cls.state.model_notifier.SetEndUpdating()
  385.                     told_notifier = False
  386.                 
  387.                 if cls.finish_processing:
  388.                     break
  389.                 
  390.                 time.sleep(1)
  391.                 continue
  392.  
  393.             if uid in cls.processed_ids:
  394.                 continue
  395.             
  396.             cls.processed_ids.append(uid)
  397.             result = cls.process_message(msg, uid, info, current_account)
  398.             counts[result] += 1
  399.         if options[('globals', 'verbose')]:
  400.             print 'Processing queue ended.'
  401.         
  402.  
  403.     process_queue = classmethod(process_queue)
  404.     
  405.     def get_blocking_welcome_message(username, server):
  406.         server_dn = dnslookup.lookup[server]
  407.         e = '%s@%s' % (username, server_dn)
  408.         hr_diff = time.localtime()[3] - time.gmtime()[3]
  409.         min_diff = time.localtime()[4] - time.gmtime()[4]
  410.         received_time = time.strftime('%%d %%b %%Y %%H:%%M:%%S %+.2d%.2d' % (hr_diff, min_diff))
  411.         msg = ('Return-path: <info@spamexperts.com>', 'Received: (via local SpamExperts system); %s' % (received_time,), 'Envelope-to: info@spamexperts.com', 'X-URL: http://www.spamexperts.com/', _('Subject: Welcome to SpamExperts!'), 'From: SpamExperts <info@spamexperts.com>', 'Sender: info@spamexperts.com', 'Reply-To: info@spamexperts.com', 'To: %s' % (e,), 'Date: %s' % (time.strftime('%a, %d %B %Y %H:%M:%S'),), 'Message-ID: <spamexperts_welcome@%s>' % (server_dn,), '', _("SpamExperts will now block any 'spam' received for your %s email account from mail server %s.") % (username, server_dn), '', _('There is no need for you to change your email settings. When you normally check your email, our application first retrieves and filters all email from your mail account. After this the email is ready to be retrieved by your mail client.'), '', _('Whenever our application makes a mistake, you can easily correct this by opening our application (double click the SpamExperts icon at the bottom-right next to the Windows clock). Simply drag the misclassified email to the appropriate category.'), '', _('Please note that it may take up to 14 days for the application to have learned enough information to correctly classify your email.'), '', _('For more information or questions please visit http://www.spamexperts.com/'), '', _('The SpamExperts team'))
  412.         return '\r\n'.join(msg).encode('latin-1')
  413.  
  414.     get_blocking_welcome_message = staticmethod(get_blocking_welcome_message)
  415.     
  416.     def get_welcome_message(username, server):
  417.         server_dn = dnslookup.lookup[server]
  418.         e = '%s@%s' % (username, server_dn)
  419.         hr_diff = time.localtime()[3] - time.gmtime()[3]
  420.         min_diff = time.localtime()[4] - time.gmtime()[4]
  421.         received_time = time.strftime('%%d %%b %%Y %%H:%%M:%%S %+.2d%.2d' % (hr_diff, min_diff))
  422.         msg = ('Return-path: <info@spamexperts.com>', 'Received: (via local SpamExperts system); %s' % (received_time,), 'Envelope-to: info@spamexperts.com', 'X-URL: http://www.spamexperts.com/', _('Subject: Welcome to SpamExperts!'), 'From: SpamExperts <info@spamexperts.com>', 'Sender: info@spamexperts.com', 'Reply-To: info@spamexperts.com', 'To: %s' % (e,), 'Date: %s' % (time.strftime('%a, %d %B %Y %H:%M:%S'),), '', _("SpamExperts will classify any 'spam' received for your %s email account from mail server %s.") % (username, server_dn), '', _('There is no need for you to change your email settings. When you normally check your email, our application first retrieves and filters all email from your mail account. After this the email is ready to be retrieved by your mail client.'), '', _('Whenever our application makes a mistake, you can easily correct this by opening our application (double click the SpamExperts icon at the bottom-right next to the Windows clock). Simply drag the misclassified email to the appropriate category.'), '', _('Please note that it may take up to 14 days for the application to have learned enough information to correctly classify your email.'), '', _('For more information or questions please visit http://www.spamexperts.com/'), '', _('The SpamExperts team'))
  423.         return '\r\n'.join(msg).encode('latin-1')
  424.  
  425.     get_welcome_message = staticmethod(get_welcome_message)
  426.  
  427.